# coding: utf-8
import urllib
import urllib2
import cookielib
from HTMLParser import HTMLParser
import re
import xlrd
import xlwt
import string
from urllib2 import Request, urlopen, URLError, HTTPError
import requests


class myHtmlParser(HTMLParser):
    """HTML parser that collects link targets and/or text nodes.

    Behaviour is switched on by setting flags on the instance before
    calling ``feed``:

    * ``irstart == 1``: the ``href`` value of every ``<a>`` tag is appended
      to ``result`` and printed.
    * ``irend == 1``: every text node that is non-empty after stripping is
      appended to ``data`` and printed.
    * ``iscleasspace == 1``: CRLF pairs and spaces are additionally removed
      from the text before it is stored (only relevant with ``irend``).

    ``result`` and ``data`` are created per instance, so separate parsers
    never see each other's collected items.
    """

    # Class-level defaults for the flags; callers override them per instance.
    irstart = 0
    irend = 0
    isfinids = 0  # set by callers, never read inside this class
    iscleasspace = 0

    def __init__(self):
        HTMLParser.__init__(self)
        self.flag = None
        # Per-instance containers: a class-level list would be shared by
        # every parser object and accumulate items across parses.
        self.result = []
        self.data = []

    def handle_starttag(self, tag, attrs):
        """Record the ``href`` of ``<a>`` tags while ``irstart`` is 1."""
        if self.irstart != 1 or tag != 'a':
            return
        self.flag = 'a'
        for attr_name, attr_value in attrs:
            if attr_name == 'href':
                self.result.append(attr_value)
                print(attr_value)

    def handle_data(self, data):
        """Record non-blank text nodes while ``irend`` is 1."""
        if self.irend != 1:
            return
        sdata = data.strip()
        if sdata == '':
            return
        if self.iscleasspace == 1:
            sdata = sdata.replace("\r\n", "")
            sdata = sdata.replace(" ", "")
            # NOTE(review): second space removal kept from the original code;
            # confirm whether it was meant to target another whitespace
            # character (e.g. a full-width space).
            sdata = sdata.replace(' ', '')
        self.data.append(sdata)
        print('==========:' + sdata)

#===========================================================================================
# Step 1: walk the paginated project list and collect the relative links of
# the project detail pages into urlList.
findIndex = 1
urlList = []
pagecount = 1

while findIndex <= pagecount:
    # A fresh cookie jar and opener are built for every list page; the opener
    # left over from the last iteration is reused by the detail-page requests
    # further down in this script.
    cookie = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
    url = 'http://fsfc.fsjw.gov.cn/Templets/FoShan/aspx/hpms/ProjectInfoList.aspx'

    # Form fields posted to the list page; only the page index varies.
    values = {
        'txtXMMC': '',
        'txtYSZH': '',
        'txtKFS': '',
        'txtXMDZ': '',
        'PageNavigator1$24txtNewPageIndex': str(findIndex),
        'hdSelectedQY': '04',
    }
    payload = urllib.urlencode(values)
    req = urllib2.Request(url, payload)
    for header_name, header_value in (
        ('Host', 'fsfc.fsjw.gov.cn'),
        ('Referer', url),  # the list page refers to itself
        ('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0'),
    ):
        req.add_header(header_name, header_value)

    response = opener.open(req)
    the_page = response.read()
    response.close()

    # Parse the page in link-collecting mode.
    m = myHtmlParser()
    m.irstart = 1
    m.isfinids = 1
    m.result = []
    m.feed(the_page)
    m.close()
    findIndex += 1

    # Keep only the links that point at a project detail page.
    urlList.extend(
        link for link in m.result
        if link.find('ProjectDetailsInfo.aspx?') != -1
    )
    # Example detail-page URLs:
    # http://fsfc.fsjw.gov.cn/Templets/FoShan/aspx/hpms/ProjectDetailsInfo.aspx?CODE=43054&dblx=2
    # http://fsfc.fsjw.gov.cn/Templets/FoShan/aspx/hpms/ProjectDetailsInfo.aspx?CODE=43508&dblx=2
#=============================================================

# Step 2: create the output workbook with a single sheet and its header row.
print('创建表格')
f = xlwt.Workbook()  # the workbook that is saved at the end of the script
sheetname = "Sheet1"
sheet = f.add_sheet(sheetname, cell_overwrite_ok=True)

# Column titles of row 0, listed in column order.
header_titles = (
    u'开发商',
    u'项目名称',
    u'项目地址',
    u'行政区划',
    u'总建筑面积',
    u'容积率',
    u'资质证书编号',
    u'资质等级',
)
for column, title in enumerate(header_titles):
    sheet.write(0, column, title)




#=============================================================
# Step 3: fetch project detail pages, print their text nodes and the sales
# figures found in them, then save the workbook.
# NOTE: the figures are only printed here; this loop writes nothing to the
# sheet.
print(str(len(urlList)))

DETAIL_BASE_URL = 'http://fsfc.fsjw.gov.cn/Templets/FoShan/aspx/hpms/'
# Only the first collected link is fetched; raise the limit to process more.
MAX_DETAIL_PAGES = 1
# Compiled once; removing every whitespace run also covers CRLF and spaces.
WHITESPACE_RE = re.compile(r'\s+')
# (span id, printed label) for each figure. In the whitespace-stripped page
# the value sits between '"<span id>">' and the following '</span>'.
SALES_FIELDS = (
    ('lblZZYSZTS', u'已售总套数'),
    ('lblZZYSZMJ', u'已售总面积'),
    ('lblZZYSJJ', u'已售均价'),
    ('lblZZWSTS', u'未售套数'),
    ('lblZZWSMJ', u'未售面积'),
)

# Slicing (instead of indexing) makes an empty urlList a no-op rather than an
# IndexError.
for detail_path in urlList[:MAX_DETAIL_PAGES]:
    print('==============================================================================')
    url = DETAIL_BASE_URL + detail_path
    print(u'读取网址:' + url)
    req = urllib2.Request(url)
    req.add_header('Host', 'fsfc.fsjw.gov.cn')
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0')
    try:
        response = opener.open(req)
    except Exception as err:
        # Best effort: report and move on to the next page. The for loop
        # guarantees progress, so a failing URL is no longer retried forever,
        # and KeyboardInterrupt/SystemExit are not swallowed.
        print(u'读取网站出错了')
        print(repr(err))
        continue
    try:
        strinfo = response.read()
    finally:
        response.close()
    print(strinfo)

    # Run the parser in text-collecting mode; it prints each text node.
    m = myHtmlParser()
    m.data = []
    m.irend = 1
    m.isfinids = 1
    m.result = []
    m.feed(strinfo)
    m.close()

    # Strip all whitespace so the markers below match regardless of layout.
    compact = WHITESPACE_RE.sub('', strinfo)

    for span_id, label in SALES_FIELDS:
        parts = compact.split('"%s">' % span_id)
        # Exactly two parts means the marker occurs exactly once.
        if len(parts) != 2:
            continue
        print("==============sssssssssssssssssssss==================")
        # An empty span is reported as '0'.
        value = parts[1].split('</span>')[0] or '0'
        print(label + value)

f.save(u'高明商品房信息.xls')
